{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "#!wget -c ftp://ngs.sanger.ac.uk/production/ag1000g/phase1/AR3/variation/main/hdf5/ag1000g.phase1.ar3.pass.3L.h5\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "from math import ceil\n",
    "\n",
    "import numpy as np\n",
    "\n",
    "import h5py"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "h5_3L = h5py.File('ag1000g.phase1.ar3.pass.3L.h5', 'r')\n",
    "samples = h5_3L['/3L/samples']\n",
    "calldata_genotype = h5_3L['/3L/calldata/genotype']\n",
    "positions = h5_3L['/3L/variants/POS']\n",
    "alt_alleles = h5_3L['/3L/variants/ALT']\n",
    "is_snp = h5_3L['/3L/variants/is_snp']\n",
    "num_samples = len(samples)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<HDF5 dataset \"genotype\": shape (9643193, 765, 2), type \"|i1\">\n"
     ]
    }
   ],
   "source": [
    "print(calldata_genotype)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<HDF5 dataset \"samples\": shape (765,), type \"|S8\">\n"
     ]
    }
   ],
   "source": [
    "print(samples)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "window_size = 50000"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "last_position = positions[-1]\n",
    "num_windows = ceil(last_position / window_size)\n",
    "limits = np.full((num_windows, 2), -1)\n",
    "curr_window = positions[0] // window_size\n",
    "limits[curr_window, 0] = 0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0 9790\n",
      "1000000 11842226\n",
      "2000000 16486667\n",
      "3000000 19736472\n",
      "4000000 23745887\n",
      "5000000 27394730\n",
      "6000000 30373865\n",
      "7000000 33609094\n",
      "8000000 36894938\n",
      "9000000 40144443\n"
     ]
    }
   ],
   "source": [
    "for index, position in enumerate(positions):\n",
    "    my_window = position // window_size\n",
    "    if index % 1000000 == 0:\n",
    "        print(index, position)\n",
    "    if my_window != curr_window:\n",
    "        limits[my_window, 0] = index\n",
    "        limits[curr_window, 1] = index - 1\n",
    "        curr_window = my_window\n",
    "limits[num_windows - 1, 1] = len(positions)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[ 0 43] [9642606 9643193]\n"
     ]
    }
   ],
   "source": [
    "print(limits[0], limits[-1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "839 840\n"
     ]
    }
   ],
   "source": [
    "print(positions[-1] //  window_size, num_windows)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "def calc_statistics(v):\n",
    "    start, end = v[0], v[1]\n",
    "    min_maf = 0.5\n",
    "    max_maf = 0.0\n",
    "    non_bi = 0\n",
    "    missing = 0\n",
    "    non_snp = 0\n",
    "    num_samples = len(samples)\n",
    "    print(v)\n",
    "    for pos in range(start, end + 1):\n",
    "        if not is_snp[pos]:\n",
    "            non_snp += 1\n",
    "            continue\n",
    "        if np.max(calldata_genotype[pos]) > 1:  \n",
    "               non_bi += 1\n",
    "               continue\n",
    "        if np.min(calldata_genotype[pos]) < 0:  \n",
    "               missing += 1\n",
    "               continue\n",
    "        num_alt = np.sum(calldata_genotype[pos])  # Because they are coded as 1\n",
    "        num_ref = num_samples * 2 - num_alt  # (because all are called)\n",
    "        min_called = min(num_ref, num_alt)\n",
    "        maf = min_called / (2 * num_samples)\n",
    "        if maf < min_maf:\n",
    "            min_maf = maf\n",
    "        if maf > max_maf:\n",
    "            max_maf = maf\n",
    "    return {'total': end - start + 1, 'missing': missing,\n",
    "            'non_snp': non_snp, 'non_bi': non_bi,\n",
    "            'min_maf': min_maf, 'max_maf': max_maf}\n",
    "\n",
    "calc_statistics_v = np.vectorize(calc_statistics, signature='(m)->()')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[ 0 43]\n",
      "[ 44 965]\n",
      "[ 966 1912]\n",
      "[1913 3420]\n",
      "[3421 3436]\n",
      "[3437 3803]\n",
      "[3804 5038]\n",
      "[5039 6608]\n",
      "[6609 6801]\n",
      "[6802 7056]\n"
     ]
    }
   ],
   "source": [
    "stats = calc_statistics_v(limits[:10])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([{'total': 44, 'missing': 3, 'non_snp': 0, 'non_bi': 4, 'min_maf': 0.00065359477124183, 'max_maf': 0.0196078431372549},\n",
       "       {'total': 922, 'missing': 153, 'non_snp': 0, 'non_bi': 32, 'min_maf': 0.00065359477124183, 'max_maf': 0.4869281045751634},\n",
       "       {'total': 947, 'missing': 178, 'non_snp': 0, 'non_bi': 37, 'min_maf': 0.00065359477124183, 'max_maf': 0.4516339869281046},\n",
       "       {'total': 1508, 'missing': 77, 'non_snp': 0, 'non_bi': 49, 'min_maf': 0.00065359477124183, 'max_maf': 0.39477124183006534},\n",
       "       {'total': 16, 'missing': 0, 'non_snp': 0, 'non_bi': 2, 'min_maf': 0.00065359477124183, 'max_maf': 0.3235294117647059},\n",
       "       {'total': 367, 'missing': 8, 'non_snp': 0, 'non_bi': 10, 'min_maf': 0.0, 'max_maf': 0.4503267973856209},\n",
       "       {'total': 1235, 'missing': 8, 'non_snp': 0, 'non_bi': 54, 'min_maf': 0.0, 'max_maf': 0.3954248366013072},\n",
       "       {'total': 1570, 'missing': 32, 'non_snp': 0, 'non_bi': 68, 'min_maf': 0.00065359477124183, 'max_maf': 0.3934640522875817},\n",
       "       {'total': 193, 'missing': 0, 'non_snp': 0, 'non_bi': 4, 'min_maf': 0.00065359477124183, 'max_maf': 0.2549019607843137},\n",
       "       {'total': 255, 'missing': 39, 'non_snp': 0, 'non_bi': 16, 'min_maf': 0.00065359477124183, 'max_maf': 0.4117647058823529}],\n",
       "      dtype=object)"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "print(stats)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
